{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from keras.preprocessing.text import one_hot, Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential\n",
    "from keras.wrappers.scikit_learn import KerasClassifier\n",
    "from keras.layers import Dense, Flatten, Embedding, LSTM, SpatialDropout1D, Input, Bidirectional,Dropout, Activation, GRU\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.utils.np_utils import to_categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('Train_DataSet.csv')\n",
    "train_label = pd.read_csv('Train_DataSet_Label.csv')\n",
    "train = pd.merge(train_data, train_label, how='left', on='id')\n",
    "train = train[(train.label.notnull()) & (train.content.notnull())]\n",
    "test = pd.read_csv('Test_DataSet.csv')\n",
    "\n",
    "train['title'] = train['title'].fillna('')\n",
    "train['content'] = train['content'].fillna('')\n",
    "test['title'] = test['title'].fillna('')\n",
    "test['content'] = test['content'].fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def filter(text):\n",
    "    text = re.sub(\"[A-Za-z0-9\\!\\=\\？\\%\\[\\]\\,\\（\\）\\>\\<:&lt;\\/#\\. -----\\_]\", \"\", text)\n",
    "    text = text.replace('图片', '')\n",
    "    text = text.replace('\\xa0', '') # 删除nbsp\n",
    "    # new\n",
    "    r1 =  \"\\\\【.*?】+|\\\\《.*?》+|\\\\#.*?#+|[.!/_,$&%^*()<>+\"\"'?@|:~{}#]+|[——！\\\\\\，。=？、：“”‘’￥……（）《》【】]\"\n",
    "    cleanr = re.compile('<.*?>')\n",
    "    text = re.sub(cleanr, ' ', text)        #去除html标签\n",
    "    text = re.sub(r1,'',text)\n",
    "    text = text.strip()\n",
    "    return text\n"
   ]
  },
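  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick demo of filter_text (illustrative; the sample string is made up):\n",
    "# brackets, HTML tags, digits and Latin characters should all be stripped.\n",
    "print(filter_text('【测试】这是一条 <b>HTML</b> 新闻标题，图片 123 abc！'))"
   ]
  },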
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(data):\n",
    "    data['title'] = data['title'].apply(lambda x: filter(x))\n",
    "    data['content'] = data['content'].apply(lambda x: filter(x))\n",
    "    return data\n",
    "train = clean_text(train)\n",
    "test = clean_text(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = pd.read_table('stop.txt', header=None)[0].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "import string\n",
    "table = str.maketrans(\"\",\"\",string.punctuation)\n",
    "def cut_text(sentence):\n",
    "    tokens = list(jieba.cut(sentence))\n",
    "    # 去除停用词\n",
    "    tokens = [token for token in tokens if token not in stop_words]\n",
    "#     # 去除英文标点\n",
    "#     tokens = [w.translate(table) for w in tokens]\n",
    "    return tokens"
   ]
  },
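  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick demo of cut_text (illustrative; the sample sentence is made up):\n",
    "# jieba segments the sentence, then stopwords from stop.txt are dropped.\n",
    "print(cut_text('今天股市大涨，投资者信心明显增强'))"
   ]
  },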
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_title = [cut_text(sent) for sent in train.title.values]\n",
    "train_content = [cut_text(sent) for sent in train.content.values]\n",
    "test_title = [cut_text(sent) for sent in test.title.values]\n",
    "test_content = [cut_text(sent) for sent in test.content.values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_doc = train_title + train_content + test_title + test_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "import time\n",
    "class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):\n",
    "    '''用于保存模型, 打印损失函数等等'''\n",
    "    def __init__(self, save_path):\n",
    "        self.save_path = save_path\n",
    "        self.epoch = 0\n",
    "        self.pre_loss = 0\n",
    "        self.best_loss = 999999999.9\n",
    "        self.since = time.time()\n",
    "\n",
    "    def on_epoch_end(self, model):\n",
    "        self.epoch += 1\n",
    "        cum_loss = model.get_latest_training_loss() # 返回的是从第一个epoch累计的\n",
    "        epoch_loss = cum_loss - self.pre_loss\n",
    "        time_taken = time.time() - self.since\n",
    "        print(\"Epoch %d, loss: %.2f, time: %dmin %ds\" % \n",
    "                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))\n",
    "        if self.best_loss > epoch_loss:\n",
    "            self.best_loss = epoch_loss\n",
    "            print(\"Better model. Best loss: %.2f\" % self.best_loss)\n",
    "            model.save(self.save_path)\n",
    "            print(\"Model %s save done!\" % self.save_path)\n",
    "\n",
    "        self.pre_loss = cum_loss\n",
    "        self.since = time.time()\n",
    "# model_word2vec = gensim.models.Word2Vec.load('final_word2vec_model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_word2vec = gensim.models.Word2Vec(min_count=1, \n",
    "                                        window=5, \n",
    "                                        size=256,\n",
    "                                        workers=4,\n",
    "                                        batch_words=1000)\n",
    "since = time.time()\n",
    "model_word2vec.build_vocab(all_doc, progress_per=2000)\n",
    "time_elapsed = time.time() - since\n",
    "print('Time to build vocab: {:.0f}min {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "since = time.time()\n",
    "model_word2vec.train(all_doc, total_examples=model_word2vec.corpus_count, \n",
    "                        epochs=5, compute_loss=True, report_delay=60*10,\n",
    "                        callbacks=[EpochSaver('./final_word2vec_model')])\n",
    "time_elapsed = time.time() - since\n",
    "print('Time to train: {:.0f}min {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))"
   ]
  },
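  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check on the trained embeddings (illustrative, not part of\n",
    "# the original pipeline). Assumes gensim < 4.0, matching the size= argument\n",
    "# above; under gensim 4.x use model_word2vec.wv.key_to_index instead.\n",
    "probe = next(iter(model_word2vec.wv.vocab))\n",
    "print(probe, model_word2vec.wv.most_similar(probe, topn=5))"
   ]
  },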
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(train_title + test_title)\n",
    "# tokenizer.fit_on_texts(train_content + test_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 转化成词向量矩阵，利用新的word2vec模型\n",
    "vocab_size = len(tokenizer.word_index)\n",
    "error_count=0\n",
    "embedding_matrix = np.zeros((vocab_size + 1, 256))\n",
    "for word, i in tqdm(tokenizer.word_index.items()):\n",
    "    if word in model_word2vec:\n",
    "        embedding_matrix[i] = model_word2vec.wv[word]\n",
    "    else:\n",
    "        error_count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence = tokenizer.texts_to_sequences(train_title)\n",
    "traintitle = pad_sequences(sequence, maxlen=30)\n",
    "sequence = tokenizer.texts_to_sequences(test_title)\n",
    "testtitle = pad_sequences(sequence, maxlen=30)\n",
    "# sequence = tokenizer.texts_to_sequences(train_content)\n",
    "# traincontent = pad_sequences(sequence, maxlen=512)\n",
    "# sequence = tokenizer.texts_to_sequences(test_content)\n",
    "# testcontent = pad_sequences(sequence, maxlen=512)"
   ]
  },
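  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (not in the original notebook): both matrices should be\n",
    "# (num_samples, 30) after padding/truncation.\n",
    "print(traintitle.shape, testtitle.shape)"
   ]
  },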
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BaseLine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "def metric_F1score(y_true,y_pred):    \n",
    "    TP=tf.reduce_sum(y_true*tf.round(y_pred))\n",
    "    TN=tf.reduce_sum((1-y_true)*(1-tf.round(y_pred)))\n",
    "    FP=tf.reduce_sum((1-y_true)*tf.round(y_pred))\n",
    "    FN=tf.reduce_sum(y_true*(1-tf.round(y_pred)))\n",
    "    precision=TP/(TP+FP)\n",
    "    recall=TP/(TP+FN)\n",
    "    F1score=2*precision*recall/(precision+recall)\n",
    "    return F1score"
   ]
  },
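  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy check of metric_F1score (illustrative only), assuming a TensorFlow 1.x\n",
    "# backend as implied by the standalone Keras imports above.\n",
    "y_true = tf.constant([[1., 0., 0.], [0., 1., 0.]])\n",
    "y_pred = tf.constant([[0.9, 0.05, 0.05], [0.2, 0.7, 0.1]])\n",
    "with tf.Session() as sess:\n",
    "    print(sess.run(metric_F1score(y_true, y_pred)))  # rounding recovers the labels -> ~1.0"
   ]
  },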
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import keras\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, \n",
    "                    output_dim=256, \n",
    "                    input_length=30, \n",
    "                    weights=[embedding_matrix]))\n",
    "model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.1)))\n",
    "model.add(Dense(10))\n",
    "model.add(Dropout(0.35))\n",
    "model.add(Dense(3, activation='softmax'))\n",
    "optimizer = keras.optimizers.Adam(lr=0.005)\n",
    "model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])\n",
    "# learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', \n",
    "#                                             patience=3, \n",
    "#                                             verbose=1, \n",
    "#                                             factor=0.5, \n",
    "#                                             min_lr=0.00001)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten\n",
    "from keras.layers import Bidirectional, GlobalMaxPool1D\n",
    "from keras.models import Model, Sequential\n",
    "from keras.layers import Convolution1D\n",
    "from keras import initializers, regularizers, constraints, optimizers, layers\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, \n",
    "                    output_dim=256, \n",
    "                    input_length=30, \n",
    "                    weights=[embedding_matrix]))\n",
    "model.add(Bidirectional(LSTM(32, return_sequences = True)))\n",
    "model.add(GlobalMaxPool1D())\n",
    "model.add(Dense(20, activation=\"relu\"))\n",
    "model.add(Dropout(0.05))\n",
    "model.add(Dense(3, activation=\"softmax\"))\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TextCNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras import Input, Model\n",
    "from keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM\n",
    "\n",
    "from attention import Attention\n",
    "\n",
    "class TextAttBiRNN(object):\n",
    "    def __init__(self, maxlen, max_features, embedding_dims,\n",
    "                 class_num=1,\n",
    "                 last_activation='sigmoid'):\n",
    "        self.maxlen = maxlen\n",
    "        self.max_features = max_features\n",
    "        self.embedding_dims = embedding_dims\n",
    "        self.class_num = class_num\n",
    "        self.last_activation = last_activation\n",
    "\n",
    "    def get_model(self):\n",
    "        input = Input((self.maxlen,))\n",
    "\n",
    "        embedding = Embedding(self.max_features, self.embedding_dims, \n",
    "                              input_length=self.maxlen, weights=[embedding_matrix])(input)\n",
    "        x = Bidirectional(LSTM(128, return_sequences=True))(embedding)  # LSTM or GRU\n",
    "        x = Attention(self.maxlen)(x)\n",
    "\n",
    "        output = Dense(self.class_num, activation=self.last_activation)(x)\n",
    "        model = Model(inputs=input, outputs=output)\n",
    "        return model\n",
    "model = TextAttBiRNN(maxlen=30, max_features=len(tokenizer.word_index) + 1,\n",
    "                    embedding_dims=256, class_num=3, last_activation='softmax').get_model()\n",
    "model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# textCnn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras import Input, Model\n",
    "from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout\n",
    "class TextCNN(object):\n",
    "    def __init__(self, maxlen, max_features, embedding_dims,\n",
    "                 class_num=1,\n",
    "                 last_activation='sigmoid'):\n",
    "        self.maxlen = maxlen\n",
    "        self.max_features = max_features\n",
    "        self.embedding_dims = embedding_dims\n",
    "        self.class_num = class_num\n",
    "        self.last_activation = last_activation\n",
    "\n",
    "    def get_model(self):\n",
    "        input = Input((self.maxlen,))\n",
    "\n",
    "        # Embedding part can try multichannel as same as origin paper\n",
    "        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen,\n",
    "                              weights=[embedding_matrix])(input)\n",
    "        convs = []\n",
    "        for kernel_size in [3, 4, 5]:\n",
    "            c = Conv1D(128, kernel_size, activation='relu')(embedding)\n",
    "            c = GlobalMaxPooling1D()(c)\n",
    "            convs.append(c)\n",
    "        x = Concatenate()(convs)\n",
    "\n",
    "        output = Dense(self.class_num, activation=self.last_activation)(x)\n",
    "        model = Model(inputs=input, outputs=output)\n",
    "        return model\n",
    "    \n",
    "model = TextCNN(maxlen=30, max_features=len(tokenizer.word_index) + 1,\n",
    "                    embedding_dims=256, class_num=3, last_activation='softmax').get_model()\n",
    "model.compile('adam', 'categorical_crossentropy', metrics=['accuracy',metric_F1score])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = train['label'].astype(int)\n",
    "# labels = to_categorical(label) \n",
    "# train_X, val_X, train_Y, val_Y = train_test_split(traintitle, label, shuffle=True, test_size=0.2,\n",
    "#                                                     random_state=2019)\n",
    "train_X, val_X, train_Y, val_Y = train_test_split(traintitle, label, shuffle=True, test_size=0.2,\n",
    "                                                    random_state=2019)\n",
    "train_Y = to_categorical(train_Y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(train_X,\n",
    "          train_Y,\n",
    "          batch_size=128,\n",
    "          epochs=10)\n",
    "# model.fit(traintitle,\n",
    "#           labels,\n",
    "#           batch_size=128,\n",
    "#           epochs=3,\n",
    "#           shuffle=True)"
   ]
  },
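  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative training sketch (commented out; not the run used for the\n",
    "# submission): monitor the held-out split and reuse the ReduceLROnPlateau\n",
    "# settings commented out in the baseline cell above.\n",
    "# from keras.callbacks import ReduceLROnPlateau\n",
    "# model.fit(train_X, train_Y,\n",
    "#           validation_data=(val_X, to_categorical(val_Y)),\n",
    "#           batch_size=128, epochs=10,\n",
    "#           callbacks=[ReduceLROnPlateau(monitor='val_acc', patience=3,\n",
    "#                                        verbose=1, factor=0.5, min_lr=0.00001)])"
   ]
  },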
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "pred_val = model.predict(val_X)\n",
    "print(f1_score(val_Y, np.argmax(pred_val, axis=1), average='macro'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds = np.argmax(model.predict(testtitle), axis=1)\n",
    "test['label'] = preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[test.label==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[['id', 'label']].to_csv('baseline4.csv', index=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
